Exploring Dataset¶

# import pandas
import pandas as pd 

# read the CSV file at ../data/framingham.csv (a relative path) into a DataFrame named df
df = pd.read_csv("../data/framingham.csv")

Head¶

# head(): with no argument, shows the first 5 rows
df.head() 

	male	age	education	currentSmoker	cigsPerDay	prevalentHyp	totChol	sysBP	diaBP	BMI	heartRate	glucose	TenYearCHD
0	1	39	4.0	0	0.0	0	195.0	106.0	70.0	26.97	80.0	77.0	0
1	0	46	2.0	0	0.0	0	250.0	121.0	81.0	28.73	95.0	76.0	0
2	1	48	1.0	1	20.0	0	245.0	127.5	80.0	25.34	75.0	70.0	0
3	0	61	3.0	1	30.0	1	225.0	150.0	95.0	28.58	65.0	103.0	1
4	0	46	3.0	1	23.0	0	285.0	130.0	84.0	23.10	85.0	85.0	0

# head(n): shows the first n rows; here n = 8
df.head(8)

	male	age	education	currentSmoker	cigsPerDay	prevalentHyp	totChol	sysBP	diaBP	BMI	heartRate	glucose	TenYearCHD
0	1	39	4.0	0	0.0	0	195.0	106.0	70.0	26.97	80.0	77.0	0
1	0	46	2.0	0	0.0	0	250.0	121.0	81.0	28.73	95.0	76.0	0
2	1	48	1.0	1	20.0	0	245.0	127.5	80.0	25.34	75.0	70.0	0
3	0	61	3.0	1	30.0	1	225.0	150.0	95.0	28.58	65.0	103.0	1
4	0	46	3.0	1	23.0	0	285.0	130.0	84.0	23.10	85.0	85.0	0
5	0	43	2.0	0	0.0	1	228.0	180.0	110.0	30.30	77.0	99.0	0
6	0	63	1.0	0	0.0	0	205.0	138.0	71.0	33.11	60.0	85.0	1
7	0	45	2.0	1	20.0	0	313.0	100.0	71.0	21.68	79.0	78.0	0

Tail¶

# tail(): with no argument, shows the last 5 rows
df.tail() 

	male	age	education	currentSmoker	cigsPerDay	BPMeds	prevalentHyp	totChol	sysBP	diaBP	BMI	heartRate	glucose	TenYearCHD
4235	0	48	2.0	1	20.0	NaN	0	248.0	131.0	72.0	22.00	84.0	86.0	0
4236	0	44	1.0	1	15.0	0.0	0	210.0	126.5	87.0	19.16	86.0	NaN	0
4237	0	52	2.0	0	0.0	0.0	0	269.0	133.5	83.0	21.47	80.0	107.0	0
4238	1	40	3.0	0	0.0	0.0	1	185.0	141.0	98.0	25.60	67.0	72.0	0
4239	0	39	3.0	1	30.0	0.0	0	196.0	133.0	86.0	20.91	85.0	80.0	0

# tail(n): shows the last n rows; here n = 8
df.tail(8)

	male	age	education	currentSmoker	cigsPerDay	BPMeds	prevalentHyp	totChol	sysBP	diaBP	BMI	heartRate	glucose	TenYearCHD
4232	1	68	1.0	0	0.0	0.0	1	176.0	168.0	97.0	23.14	60.0	79.0	1
4233	1	50	1.0	1	1.0	0.0	1	313.0	179.0	92.0	25.97	66.0	86.0	1
4234	1	51	3.0	1	43.0	0.0	0	207.0	126.5	80.0	19.71	65.0	68.0	0
4235	0	48	2.0	1	20.0	NaN	0	248.0	131.0	72.0	22.00	84.0	86.0	0
4236	0	44	1.0	1	15.0	0.0	0	210.0	126.5	87.0	19.16	86.0	NaN	0
4237	0	52	2.0	0	0.0	0.0	0	269.0	133.5	83.0	21.47	80.0	107.0	0
4238	1	40	3.0	0	0.0	0.0	1	185.0	141.0	98.0	25.60	67.0	72.0	0
4239	0	39	3.0	1	30.0	0.0	0	196.0	133.0	86.0	20.91	85.0	80.0	0

Columns Names¶

# columns: the Index holding the column labels
df.columns

Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

Observations and Variables(Rows and Columns)¶

# shape: a (number of rows, number of columns) tuple
df.shape

(4240, 16)

Data Types¶

# dtypes: the data type of each column
df.dtypes

male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

Basic Information¶

# info(): prints a summary of the DataFrame: index range, each column's
# non-null count and dtype, and the memory usage
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB

Numerical Summary of a Dataset¶

# describe(): for each numeric column, reports count, mean and std plus the
# five-number summary (min, 25%, 50%, 75%, max)
df.describe() 

	male	age	education	currentSmoker	cigsPerDay	BPMeds	prevalentStroke	prevalentHyp	diabetes	totChol	sysBP	diaBP	BMI	heartRate	glucose	TenYearCHD
count	4240.000000	4240.000000	4135.000000	4240.000000	4211.000000	4187.000000	4240.000000	4240.000000	4240.000000	4190.000000	4240.000000	4240.000000	4221.000000	4239.000000	3852.000000	4240.000000
mean	0.429245	49.580189	1.979444	0.494104	9.005937	0.029615	0.005896	0.310613	0.025708	236.699523	132.354599	82.897759	25.800801	75.878981	81.963655	0.151887
std	0.495027	8.572942	1.019791	0.500024	11.922462	0.169544	0.076569	0.462799	0.158280	44.591284	22.033300	11.910394	4.079840	12.025348	23.954335	0.358953
min	0.000000	32.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	107.000000	83.500000	48.000000	15.540000	44.000000	40.000000	0.000000
25%	0.000000	42.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	206.000000	117.000000	75.000000	23.070000	68.000000	71.000000	0.000000
50%	0.000000	49.000000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	234.000000	128.000000	82.000000	25.400000	75.000000	78.000000	0.000000
75%	1.000000	56.000000	3.000000	1.000000	20.000000	0.000000	0.000000	1.000000	0.000000	263.000000	144.000000	90.000000	28.040000	83.000000	87.000000	0.000000
max	1.000000	70.000000	4.000000	1.000000	70.000000	1.000000	1.000000	1.000000	1.000000	696.000000	295.000000	142.500000	56.800000	143.000000	394.000000	1.000000

# .T transposes the summary: one row per column of df, one column per statistic
df.describe().T

	count	mean	std	min	25%	50%	75%	max
male	4240.0	0.429245	0.495027	0.00	0.00	0.0	1.00	1.0
age	4240.0	49.580189	8.572942	32.00	42.00	49.0	56.00	70.0
education	4135.0	1.979444	1.019791	1.00	1.00	2.0	3.00	4.0
currentSmoker	4240.0	0.494104	0.500024	0.00	0.00	0.0	1.00	1.0
cigsPerDay	4211.0	9.005937	11.922462	0.00	0.00	0.0	20.00	70.0
BPMeds	4187.0	0.029615	0.169544	0.00	0.00	0.0	0.00	1.0
prevalentStroke	4240.0	0.005896	0.076569	0.00	0.00	0.0	0.00	1.0
prevalentHyp	4240.0	0.310613	0.462799	0.00	0.00	0.0	1.00	1.0
diabetes	4240.0	0.025708	0.158280	0.00	0.00	0.0	0.00	1.0
totChol	4190.0	236.699523	44.591284	107.00	206.00	234.0	263.00	696.0
sysBP	4240.0	132.354599	22.033300	83.50	117.00	128.0	144.00	295.0
diaBP	4240.0	82.897759	11.910394	48.00	75.00	82.0	90.00	142.5
BMI	4221.0	25.800801	4.079840	15.54	23.07	25.4	28.04	56.8
heartRate	4239.0	75.878981	12.025348	44.00	68.00	75.0	83.00	143.0
glucose	3852.0	81.963655	23.954335	40.00	71.00	78.0	87.00	394.0
TenYearCHD	4240.0	0.151887	0.358953	0.00	0.00	0.0	0.00	1.0

# describe a single column (selecting one label gives a Series)
df['age'].describe() 

count    4240.000000
mean       49.580189
std         8.572942
min        32.000000
25%        42.000000
50%        49.000000
75%        56.000000
max        70.000000
Name: age, dtype: float64

# describe several columns at once (a list of labels selects those columns)
df[['age', 'BMI']].describe() 

	age	BMI
count	4240.000000	4221.000000
mean	49.580189	25.800801
std	8.572942	4.079840
min	32.000000	15.540000
25%	42.000000	23.070000
50%	49.000000	25.400000
75%	56.000000	28.040000
max	70.000000	56.800000

Exploring Series¶

# read another dataset; the path is a URL, so this cell needs network access
titanic = pd.read_csv('http://bit.ly/kaggletrain')

# examine the first 5 rows
titanic.head() 

Value Counts¶

# value_counts(): how many times each distinct value occurs in the Sex column
titanic['Sex'].value_counts() 

# normalize=True returns relative frequencies (fractions that sum to 1)
# instead of counts; multiply by 100 to get percentages
titanic['Sex'].value_counts(normalize=True) 

# value_counts() returns a pandas Series
type(titanic['Sex'].value_counts(normalize=True))

Unique()¶

# unique(): the distinct values in the Fare column
titanic['Fare'].unique() 

# unique() returns a numpy.ndarray here, not a Series
type(titanic['Fare'].unique())

Cross Tabulation¶

# crosstab: frequency table with one row per Sex value and one column per
# Survived value; each cell counts the rows having that combination
pd.crosstab(titanic['Sex'], titanic['Survived'])

Describe¶

# describe a numeric column (Age); for a categorical column such as Sex,
# describe() reports count, unique, top and freq instead
titanic['Age'].describe() 

Basic Statistics¶

# mean(): arithmetic mean of Age; like max/min/median below, it skips
# missing (NaN) values by default (skipna=True)
titanic.Age.mean() 

# max(): largest Age value
titanic.Age.max() 

# min(): smallest Age value
titanic.Age.min() 

# median(): middle Age value (the 50% quantile)
titanic.Age.median() 

Visualization¶

# IPython magic: display matplotlib figures inline in the notebook output
%matplotlib inline

# bar chart of how often each Sex value occurs
titanic.Sex.value_counts().plot(kind="bar") 

# histogram of the Age column
titanic.Age.plot(kind="hist")

Pandas for Data Analysis

Exploring Dataset¶

Head¶

Tail¶

Columns Names¶

Observations and Variables(Rows and Columns)¶

Data Types¶

Basic Information¶

Numerical Summary of a Dataset¶

Exploring Series¶

Value Counts¶

Unique()¶

Cross Tabulation¶

Describe¶

Basic Statistics¶

Visualization¶